Kate’s DVG code
message("Loading packages")
Loading packages
library('ggplot2')
library('readr')
library('reshape2')
library('ggpubr')
library('plyr')
Attaching package: ‘plyr’
The following object is masked from ‘package:ggpubr’:
mutate
library('tidyverse')
── Attaching core tidyverse packages ───────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.0 ✔ stringr 1.5.0
✔ forcats 1.0.0 ✔ tibble 3.1.8
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1 ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::arrange() masks plyr::arrange()
✖ purrr::compact() masks plyr::compact()
✖ dplyr::count() masks plyr::count()
✖ dplyr::desc() masks plyr::desc()
✖ dplyr::failwith() masks plyr::failwith()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::id() masks plyr::id()
✖ dplyr::lag() masks stats::lag()
✖ dplyr::mutate() masks plyr::mutate(), ggpubr::mutate()
✖ dplyr::rename() masks plyr::rename()
✖ dplyr::summarise() masks plyr::summarise()
✖ dplyr::summarize() masks plyr::summarize()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library('dplyr')
library('glue')
library('ggVennDiagram')
# paramters used when running divrge
grouping_param = 5
match_length_param = 28
readLength = 150
# deletion read count cutoffs
count_cut = 30
# only looking at index and direct cases
keeping = c('index','direct')
message("Setting work directory and input file names")
Setting work directory and input file names
wkdir = "/Users/marissaknoll/Desktop/GitHub/Obesity/NewExtractions/H9N2/DVGs"
setwd(wkdir)
if (!dir.exists(glue("{wkdir}/DVG_figures"))) {
dir.create(glue("{wkdir}/DVG_figures"))
}
saveitdir = glue("{wkdir}/DVG_figures")
source(glue('{wkdir}/scripts/obese_PlotPrep.R'))
# loading in metadata and coverage data
metafile = glue("{wkdir}/scripts/transmission_h9n2_use_long.csv")
meta = read.csv(file=metafile,header=T,sep=",",na.strings = c(''))
meta = filter(meta, flag == "keep")
coverage_passfile = glue('{wkdir}/scripts/H9N2.coverage.pass.check.200.0.95.csv')
cov_check = read.csv(file=coverage_passfile,header=T,sep=",",na.strings = c(''))
# filter for samples that either pass with a yes OR has good average coverage and percentage cov at 200x is > 80
cov_filt_names = cov_check %>% filter(pass == 'YES' |
mean_coverage >= 200 |
percentage > 0.7) %>%
select(name, segment) %>%
unique()
cov_filt_names = filter(cov_filt_names, name != "2244_d12") %>% unique()
# check segment count
cov_filt_names = cov_filt_names %>% group_by(name) %>% add_tally(name = 'segment_tally') %>%
ungroup() %>%
filter(segment_tally == 8) %>%
unique()
pull_names = c(levels(factor(cov_filt_names$name))) # list to pull names from
dvgfile = glue('{wkdir}/H9N2.DVG.FINAL.OneGap.N5.Mis2.M28.G5.csv') # dvg file
dvg = read.csv(file=dvgfile,header=T,sep=",",na.strings = c(''))
dvg = dvg %>% filter(name %in% pull_names) # filter for samples that pass our coverage checks
dvg$sample = dvg$name # generate new column so we can separate
dvg = dvg %>% separate(sample, c('new','cohort','ferret_id','dpi','rep'), '_') # separate into info
Warning: Expected 5 pieces. Missing pieces filled with `NA` in 40841 rows [200533, 200534, 200535, 200536, 200537, 200538, 200539, 200540, 200541, 200542, 200543, 200544, 200545, 200546, 200547, 200548, 200549, 200550, 200551, 200552, ...].
CONTROLS = dvg %>% filter(ferret_id == 'HK1073') # pulling out controls
CONTROLS$rep = CONTROLS$dpi
CONTROLS$dpi = 'stock' # adding in stock info
dvg = dvg %>% filter(!name %in% c(levels(factor(CONTROLS$name)))) %>% unique()
dvg = rbind(dvg, CONTROLS) # rbind everything so it is all in one dataframe
# prepping rep information
dvg = dvg %>% select(-SegTotalDVG) %>% filter(DVG_freq >= count_cut) %>% unique() # filter for those that pass cutoffs
rep1 = dvg %>% filter(rep == 'rep1') %>% unique()
rep2 = dvg %>% filter(rep == 'rep2') %>% unique()
# merge reps into one
dvg_reps = merge(rep1, rep2, by = c('cohort','ferret_id','dpi',
'segment','segment_size','strain',
'DVG_group','NewGap',
'NewStart','NewEnd','GroupBoundaries',
'DeletionSize','EstimatedFragLength'), all = TRUE) %>% unique()
# add in zeros
dvg_reps$DVG_freq.x[is.na(dvg_reps$DVG_freq.x)] = 0
dvg_reps$DVG_freq.y[is.na(dvg_reps$DVG_freq.y)] = 0
ggplot(dvg_reps, aes(x=DVG_freq.x, y=DVG_freq.y)) +
geom_point() +
PlotTheme1

# number of samples?
levels(factor(dvg_reps$ferret_id)) %>% length()
[1] 42
# reorder by segment size
SEGMENTS = c('H9N2_PB2', 'H9N2_PB1',
'H9N2_PA','H9N2_HA','H9N2_NP',
'H9N2_NA','H9N2_MP','H9N2_NS')
cov_check$segment = factor(cov_check$segment, levels = SEGMENTS)
cov_check %>%
filter(name %in% pull_names) %>%
ggplot(., aes(x= segment, y = mean_coverage)) +
geom_boxplot() +
PlotTheme1

cov_check %>%
filter(name %in% pull_names) %>%
ggplot(., aes(x= segment, y = median_coverage)) +
geom_boxplot() +
PlotTheme1

cov_check %>%
filter(name %in% pull_names) %>%
ggplot(., aes(x= segment, y = percentage)) +
geom_boxplot() +
PlotTheme1

df = cov_filt_names %>% select(-segment, -segment_tally) %>% unique()
df$sample = df$name
df = df %>% separate(sample, c('new','cohort','ferret_id','dpi','rep'), '_')
Warning: Expected 5 pieces. Missing pieces filled with `NA` in 10 rows [34, 35, 49, 71, 84, 153, 165, 170, 187, 201].
CONTROLS = df %>% filter(ferret_id == 'HK1073')
CONTROLS$rep = CONTROLS$dpi
CONTROLS$dpi = 'stock'
df = df %>% filter(!name %in% c(levels(factor(CONTROLS$name)))) %>% unique()
df = rbind(df, CONTROLS)
r1 = df %>% filter(rep == 'rep1') %>% select(-new) %>% unique()
r2 = df %>% filter(rep == 'rep2') %>% select(-new) %>% unique()
reps = merge(r1, r2, by = c('cohort','ferret_id','dpi')) %>% unique()
# these are the samples that only had one rep!
setdiff(levels(factor(r1$ferret_id)),
levels(factor(r2$ferret_id)))
[1] "1411" "1972" "2244"
setdiff(meta$ferret_id, reps$ferret_id) # samples in meta not in seq data
[1] "1793" "1803" "1788" "1805" "1795" "1790" "1796" "1911" "1982" "1978" "1985" "1979" "1971" "1987" "1967" "1976" "1915" "2230" "2235" "2237"
[21] "2242" "2241" "2238"
setdiff(reps$ferret_id, meta$ferret_id) # samples in seq data not in meta
[1] "1966" "1969" "1968" "2243" "1408" "1409" "1410" "1412" "1414" "1415" "1416" "1417"
m = merge(reps, meta, by = c('ferret_id'), all = TRUE)
write.csv(m, glue('{wkdir}/scripts/UPDATED.H9N2.metadata.csv'), row.names = FALSE)
temp = meta %>%
filter(type %in% c('index','direct')) %>%
unique() %>%
select(pair, type, ferret_id) %>% unique() %>%
pivot_wider(names_from = 'type', values_from = 'ferret_id')
temp = rbind(temp, c('Z','stock','stock'))
temp$pair_ids = paste0(temp$index,'>',temp$direct)
temp = temp %>% select(pair, pair_ids) %>% unique()
m = merge(m, temp, by = c('pair')) # add in additional information to metadata to work with
# type check - only stock index direct
print(levels(factor(m$type)))
[1] "aerosol" "direct" "index" "secondary" "stock"
m$type = factor(m$type, levels = c('stock','index','direct'))
m = m %>% filter(name.x != is.na(name.x)) %>% unique()
p1 = m %>% filter(ferret_id != 'HK1073') %>% unique() %>%
ggplot(., aes(x= dpi, y = pair_ids, fill = ferret_weight)) +
geom_tile(color = 'black') +
PlotTheme3 +
weight_colFill +
facet_grid(index_direct~type, scales = 'free', space = 'free')
print(p1)
ggsave(p1,
filename = glue("{wkdir}/DVG_figures/final.samples.pdf"),
width = 6,
height = 6, limitsize=FALSE, useDingbats = FALSE)
ggsave(p1,
filename = glue("{wkdir}/DVG_figures/final.samples.png"),
width = 6,
height = 6, limitsize=FALSE) #, useDingbats = FALSE)

dvg_reps = dvg_reps %>%
filter(DVG_freq.x > count_cut & DVG_freq.y > count_cut) %>%
unique() # make sure that both reps pass our cutoff
# add in variables for plotting
dvg_reps$ferret_day = paste0(dvg_reps$ferret_id, '_', dvg_reps$dpi)
m$ferret_day = paste0(m$ferret_id, '_', m$dpi)
stock_temp = dvg_reps %>% filter(dpi == 'stock') %>%
group_by(ferret_id, cohort, dpi, segment, name.x, name.y) %>%
add_tally(name = 'seg_deletion_richness') %>%
unique() %>%
ungroup() %>%
group_by(ferret_id, dpi, name.x, name.y, cohort) %>%
add_tally(name = 'deletion_richness') %>%
ungroup() %>%
unique()
s = stock_temp # will use later
# filter down stock temp information
stock_temp = stock_temp %>%
select(ferret_id, dpi, cohort,ferret_day, segment, deletion_richness, seg_deletion_richness) %>%
unique()
stock_temp = merge(stock_temp, m, by = c('ferret_id', 'dpi','cohort','ferret_day')) %>%
unique()
# filter out stock information, calculate dvg richness by segment and across genome for samples
dr = dvg_reps %>%
filter(dpi != 'stock') %>%
unique() %>%
group_by(ferret_id, dpi, segment, name.x, name.y, cohort) %>%
add_tally(name = 'seg_deletion_richness') %>%
ungroup() %>%
group_by(ferret_id, dpi, name.x, name.y, cohort) %>%
add_tally(name = 'deletion_richness') %>%
ungroup() %>%
unique()
# filter down information so you don't have duplicates
richness = dr %>%
select(ferret_id, dpi, cohort,ferret_day, segment, deletion_richness, seg_deletion_richness) %>%
unique()
# merge with metadata info
richness = merge(richness, m, by = c('ferret_id', 'dpi','cohort','ferret_day'), all.y = TRUE) %>%
unique()
# make sure we filter out stock information (will add using the 's' dataframe generated above)
richness = richness %>% filter(dpi != 'stock')
reps_df = rbind(dr, s) # final reps richness df
reps_df = merge(reps_df, m, by = c('ferret_id','dpi','cohort','ferret_day')) # add metadata
p4 = reps_df %>%
select(segment, NewGap, EstimatedFragLength, ferret_weight) %>%
unique() %>%
ggplot(., aes(x= EstimatedFragLength)) +
geom_histogram(color = 'black') +
PlotTheme1 +
labs(x="estimated DVG frag. length (nt)", y='number of unique DVG species')
print(p4)
ggsave(p4,
filename = glue("{wkdir}/DVG_figures/deletion.size.pdf"),
width = 5,
height = 5, limitsize=FALSE, useDingbats = FALSE)
ggsave(p4,
filename = glue("{wkdir}/DVG_figures/deletion.size.png"),
width =5,
height = 5, limitsize=FALSE) #, useDingbats = FALSE)

p4_alt = reps_df %>%
select(segment, NewGap, EstimatedFragLength, ferret_weight, type) %>%
unique() %>%
ggplot(., aes(x= EstimatedFragLength)) +
geom_histogram(color = 'black', binwidth = 50) +
facet_grid(type~ferret_weight) +
PlotTheme1 +
labs(x="estimated DVG frag. length (nt)", y='number of unique DVG species')
print(p4_alt)
ggsave(p4_alt,
filename = glue("{wkdir}/DVG_figures/deletion.size.bydiet.bytype.pdf"),
width = 10,
height = 5, limitsize=FALSE, useDingbats = FALSE)

lean_index = reps_df %>% filter(type == 'index' & ferret_weight == 'lean') %>%
unique() %>%
group_by(NewGap, segment, NewStart, NewEnd) %>%
add_tally(name = 'lean_deletion_count') %>%
ungroup() %>%
select(NewGap, segment, lean_deletion_count) %>%
unique()
obese_index = reps_df %>% filter(type == 'index' & ferret_weight == 'obese') %>%
unique() %>%
group_by(NewGap, segment, NewStart, NewEnd) %>%
add_tally(name = 'obese_deletion_count') %>%
ungroup() %>%
select(NewGap, segment, obese_deletion_count) %>%
unique()
df = merge(lean_index, obese_index, by = c('NewGap','segment'), all = TRUE)
head(df)
df$lean_deletion_count[is.na(df$lean_deletion_count)] = 0
df$obese_deletion_count[is.na(df$obese_deletion_count)] = 0
p8 = reps_df %>% filter(type == 'index') %>%
unique() %>%
group_by(NewGap, segment, NewStart, NewEnd) %>%
add_tally(name = 'sample_count') %>%
ungroup() %>%
select(NewGap, segment,sample_count) %>%
unique() %>%
ggplot(., aes(x=sample_count, y = ..count../sum(..count..))) +
geom_histogram(color ='black') +
labs(x='number of samples with DVG type', y='proportion of DVGs in dataset (index only)') +
PlotTheme1
print(p8)
ggsave(p8,
filename = glue("{wkdir}/DVG_figures/sample.count.histo.pdf"),
width = 5,
height = 5, limitsize=FALSE, useDingbats = FALSE)
ggsave(p8,
filename = glue("{wkdir}/DVG_figures/sample.count.histo.png"),
width =5,
height = 5, limitsize=FALSE) #, useDingbats = FALSE)

reps_df$ave_dvg_freq = (reps_df$DVG_freq.x + reps_df$DVG_freq.y)/2
reps_df = reps_df %>%
arrange(ferret_day, ave_dvg_freq) %>%
group_by(ferret_day) %>%
mutate(order_number = row_number()) %>%
ungroup() %>%
unique()
reps_df %>%
group_by(NewGap, segment, type, ferret_weight) %>%
mutate(mean_order = mean(order_number),
sample_count = n(),
min_order = min(order_number),
max_order = max(order_number),
median_ord = median(order_number)) %>%
ungroup() %>%
unique() %>%
select(segment, NewGap, mean_order, sample_count, min_order, max_order, median_ord, type, ferret_weight) %>%
filter(sample_count > 1) %>%
unique() %>%
ggplot(., aes(y=mean_order, x = sample_count)) +
geom_point() +
PlotTheme1 +
facet_grid(.~ferret_weight + type)

top_ten = reps_df %>% filter(order_number %in% c(1, 2,3 ,4, 5, 6, 7, 8, 9, 10)) %>% unique()
head(top_ten)
length(levels(factor(top_ten$NewGap)))
[1] 361
max(df$lean_deletion_count)
[1] 19
max(df$obese_deletion_count)
[1] 13
p9 = ggplot(df, aes(x=lean_deletion_count, y = obese_deletion_count)) +
geom_jitter(width = 0.1, height = 0.1, alpha = 0.3) +
geom_hline(yintercept = 0, linetype = 2, color = 'black') +
geom_hline(yintercept = 25, linetype = 2, color = 'red') +
geom_vline(xintercept = 0, linetype = 2, color = 'black') +
geom_vline(xintercept = 17, linetype = 2, color = 'red') +
labs(x= 'number of lean samples with DVG', y='number of obese samples with DVG') +
PlotTheme1
print(p9)
ggsave(p9,
filename = glue("{wkdir}/DVG_figures/sample.count.lean.v.obese.pdf"),
width = 5,
height = 5, limitsize=FALSE, useDingbats = FALSE)
ggsave(p9,
filename = glue("{wkdir}/DVG_figures/sample.count.lean.v.obese.png"),
width =5,
height = 5, limitsize=FALSE) #, useDingbats = FALSE)

reps_df %>% filter(type == 'index' & ferret_weight == 'obese') %>% select(name.x.x) %>% unique() %>% nrow()
[1] 13
reps_df %>% filter(type == 'index' & ferret_weight == 'lean') %>% select(name.x.x) %>% unique() %>% nrow()
[1] 20
richness = rbind(richness, stock_temp)
richness$deletion_richness[is.na(richness$deletion_richness)] = 0
DAYS = c('stock','d02','d04','d06','d08','d10','d12')
richness$cohort = gsub("FCC","Sp19",richness$cohort)
richness$dpi = factor(richness$dpi, levels = DAYS)
richness %>% filter(dpi %in% c('d02','d04')) %>%
filter(type == 'index' | type == 'stock') %>%
select(ferret_id, dpi, deletion_richness, type, ferret_weight, index_direct, cohort) %>%
unique() %>%
group_by(ferret_id) %>%
add_tally(name = 'n') %>%
ungroup() %>%
filter(n >= 2) %>%
ungroup() %>%
unique() %>%
ggplot(., aes(x=dpi, y = deletion_richness, color = cohort, group=ferret_id, shape = ferret_weight)) +
#geom_boxplot() +
geom_line() +
geom_point(size = 2) +
PlotTheme1 +
scale_color_brewer(palette = 'Set1')

#weight_colScale +
#facet_grid(.~cohort)
p7 = richness %>% filter(dpi %in% c('d02','d04','d06')) %>%
filter(type == 'index' | type == 'stock') %>%
select(ferret_id, dpi, deletion_richness, type, ferret_weight, index_direct, cohort) %>%
unique() %>%
group_by(ferret_id) %>%
add_tally(name = 'n') %>%
ungroup() %>%
filter(n >= 2) %>%
ungroup() %>%
unique() %>%
ggplot(., aes(x=dpi, y = deletion_richness, color = ferret_weight, group=ferret_id, shape = ferret_weight)) +
#geom_boxplot() +
geom_line() +
geom_point(size = 2) +
PlotTheme1 +
weight_colScale
#facet_grid(.~cohort)
print(p7)
ggsave(p7,
filename = glue("{wkdir}/DVG_figures/richness.index.pdf"),
width = 5,
height = 5, limitsize=FALSE, useDingbats = FALSE)
ggsave(p7,
filename = glue("{wkdir}/DVG_figures/richness.index.png"),
width =5,
height = 5, limitsize=FALSE) #, useDingbats = FALSE)

colnames(richness)
[1] "ferret_id" "dpi" "cohort" "ferret_day" "segment"
[6] "deletion_richness" "seg_deletion_richness" "pair" "name.x" "rep.x"
[11] "name.y" "rep.y" "ferret_weight" "type" "flag"
[16] "chain_position" "notes" "index_direct" "worked" "pair_ids"
order_typeday = c('stock','index_d02','index_d04',
'index_d06','index_d08','index_d10',
'index_d12',
'direct_d02','direct_d04',
'direct_d06','direct_d08','direct_d10',
'direct_d12')
richness$type_day = paste0(richness$type, '_', richness$dpi)
richness$type_day = factor(richness$type_day, levels = order_typeday)
p2 = richness %>% filter(ferret_weight == 'obese' & index_direct == 'obese_obese') %>%
select(ferret_id, dpi, deletion_richness, type, ferret_weight,
pair_ids, index_direct, type_day) %>%
ungroup() %>%
unique() %>%
ggplot(., aes(x=type_day, y = deletion_richness, color = pair_ids, group=pair_ids)) +
#geom_boxplot() +
geom_line(size = 1) +
geom_point(size = 2) +
labs(x='dpi (by index case)', y='DVG richness') +
PlotTheme1 +
scale_color_brewer(palette = 'Set2') #+
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
Please use `linewidth` instead.
#weight_colScale +
#facet_grid(.~type)
print(p2)
ggsave(p2,
filename = glue("{wkdir}/DVG_figures/obese.to.obese.diversity.pdf"),
width = 8,
height = 6, limitsize=FALSE, useDingbats = FALSE)
ggsave(p2,
filename = glue("{wkdir}/DVG_figures/obese.to.obese.diversity.png"),
width =8,
height = 6, limitsize=FALSE) #, useDingbats = FALSE)

gen_rich = richness %>%
select(ferret_id, dpi, cohort,deletion_richness, type, ferret_weight, pair_ids, index_direct, type_day) %>%
unique()
head(gen_rich)
gen_rich %>% filter(dpi %in% c('d02','d04','d06','stock')) %>%
filter(type == 'index' | type == "stock") %>%
ggplot(., aes(x=ferret_weight, y = deletion_richness, group = ferret_weight)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.2, aes(color = ferret_weight)) +
labs(x='segment',y='deletion richness') +
PlotTheme1 +
weight_colScale +
facet_grid(.~dpi)

Test for significance
o = filter(gen_rich, type == "index" & dpi == "d06" & ferret_weight == "obese")
l = filter(gen_rich, type == "index" & dpi == "d06" & ferret_weight == "lean")
t.test(o$deletion_richness,l$deletion_richness)
Welch Two Sample t-test
data: o$deletion_richness and l$deletion_richness
t = -0.91173, df = 4.5571, p-value = 0.4076
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-106.18666 51.78666
sample estimates:
mean of x mean of y
34.0 61.2
seg_rich = richness %>% #filter(ferret_weight == 'obese' & index_direct == 'obese_obese') %>%
select(ferret_id, dpi, seg_deletion_richness, type, ferret_weight,
pair_ids, index_direct, type_day, segment) %>%
unique()
head(seg_rich)
temp = seg_rich %>% select(ferret_id, dpi, type, ferret_weight, pair_ids, type_day, index_direct) %>% unique()
temp$H9N2_PB2 = 0
temp$H9N2_PB1 = 0
temp$H9N2_PA = 0
temp$H9N2_HA = 0
temp$H9N2_NP = 0
temp$H9N2_NA = 0
temp$H9N2_MP = 0
temp$H9N2_NS = 0
temp = pivot_longer(temp, cols = all_of(SEGMENTS), names_to = 'segment') %>% select(-value)
seg_rich = merge(seg_rich, temp, by = c('ferret_id', 'dpi', 'type', 'ferret_weight',
'pair_ids', 'index_direct', 'type_day', 'segment'), all = TRUE)
seg_rich$seg_deletion_richness[is.na(seg_rich$seg_deletion_richness)] = 0
seg_rich$segment = factor(seg_rich$segment, levels = SEGMENTS)
head(seg_rich)
p3 = seg_rich %>% filter(segment %in% SEGMENTS) %>%
ggplot(., aes(x=segment, y = seg_deletion_richness)) +
geom_boxplot() +
labs(x='segment',y='deletion richness') +
PlotTheme1
print(p3)
ggsave(p3,
filename = glue("{wkdir}/DVG_figures/segment.richness.pdf"),
width = 5,
height = 5, limitsize=FALSE, useDingbats = FALSE)
ggsave(p3,
filename = glue("{wkdir}/DVG_figures/segment.richness.png"),
width =5,
height = 5, limitsize=FALSE) #, useDingbats = FALSE)

seg_rich$seg_weight = paste0(seg_rich$segment, '_', seg_rich$ferret_weight)
seg_rich$ferret_weight = factor(seg_rich$ferret_weight, levels = c('stock','lean','obese'))
seg_rich %>% filter(segment %in% SEGMENTS) %>%
ggplot(., aes(x=segment, y = seg_deletion_richness, group = seg_weight, color = ferret_weight)) +
geom_boxplot() +
labs(x='segment',y='deletion richness') +
PlotTheme1 +
weight_colScale +
facet_grid(.~type)

p6 = seg_rich %>% filter(segment %in% SEGMENTS & dpi %in% c('d02','d04','d06','stock')) %>%
filter(type == 'index' | type == "stock") %>%
ggplot(., aes(x=segment, y = seg_deletion_richness, group = seg_weight, color = ferret_weight)) +
geom_boxplot(outlier.shape = NA) +
labs(x='segment',y='deletion richness') +
PlotTheme1 +
weight_colScale +
facet_grid(.~dpi)
print(p6)
ggsave(p6,
filename = glue("{wkdir}/DVG_figures/segment.index.richness.pdf"),
width = 8,
height = 4, limitsize=FALSE, useDingbats = FALSE)
ggsave(p6,
filename = glue("{wkdir}/DVG_figures/segment.index.richness.png"),
width =8,
height = 4, limitsize=FALSE) #, useDingbats = FALSE)

NA
NA
Test for significance
o = filter(seg_rich, type == "index" & dpi == "d02" & segment == "H9N2_PB2" & ferret_weight == "obese")
l = filter(seg_rich, type == "index" & dpi == "d02" & segment == "H9N2_PB2" & ferret_weight == "lean")
t.test(o$seg_deletion_richness,l$seg_deletion_richness)
Welch Two Sample t-test
data: o$seg_deletion_richness and l$seg_deletion_richness
t = -1.9934, df = 10.943, p-value = 0.07174
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-21.348926 1.063212
sample estimates:
mean of x mean of y
10.00000 20.14286
seg_rich %>% filter(type == 'index' & segment %in% SEGMENTS) %>%
ungroup() %>%
unique() %>%
ggplot(., aes(x=type_day, y = seg_deletion_richness, color = segment, group=segment)) +
#geom_boxplot() +
geom_line(size = 1) +
geom_point(size = 2) +
labs(x='dpi (by index case)', y='DVG richness') +
PlotTheme1 +
scale_color_brewer(palette = 'Set2') +
#weight_colScale +
facet_grid(.~ferret_weight + ferret_id, scales = 'free', space = 'free')

p4 = seg_rich %>% filter(ferret_weight == 'obese' & index_direct == 'obese_obese' & segment %in% SEGMENTS) %>%
ungroup() %>%
unique() %>%
ggplot(., aes(x=type_day, y = seg_deletion_richness, color = segment, group=segment)) +
#geom_boxplot() +
geom_line(size = 1) +
geom_point(size = 2) +
labs(x='dpi (by index case)', y='DVG richness') +
PlotTheme1 +
scale_color_brewer(palette = 'Set2') +
#weight_colScale +
facet_grid(.~pair_ids, scales = 'free', space = 'free')
print(p4)
ggsave(p4,
filename = glue("{wkdir}/DVG_figures/segment.obese.to.obese.richness.pdf"),
width = 12,
height = 4, limitsize=FALSE, useDingbats = FALSE)
ggsave(p4,
filename = glue("{wkdir}/DVG_figures/segment.obese.to.obese.richness.png"),
width =12,
height = 4, limitsize=FALSE) #, useDingbats = FALSE)

End of Kate’s code
Which DVGs are shared between stock and index?
reps_df$DVG = paste0(reps_df$segment,"_",reps_df$DVG_group)
F17_stock = filter(reps_df ,type == "stock", cohort == "F17")
F17_stock_dvg = unique(F17_stock$DVG)
W17_stock = filter(reps_df ,type == "stock", cohort == "W17")
W17_stock_dvg = unique(W17_stock$DVG)
Sm18_stock = filter(reps_df ,type == "stock", cohort == "Sm18")
Sm18_stock_dvg = unique(Sm18_stock$DVG)
Sp19_stock = filter(reps_df ,type == "stock", cohort == "Sp19")
Sp19_stock_dvg = unique(Sp19_stock$DVG)
Sp20_stock = filter(reps_df ,type == "stock", cohort == "Sp20")
Sp20_stock_dvg = unique(Sp20_stock$DVG)
F17_index = filter(reps_df ,type == "index", cohort == "F17")
F17_index_dvg = unique(F17_index$DVG)
W17_index = filter(reps_df ,type == "index", cohort == "W17")
W17_index_dvg = unique(W17_index$DVG)
Sm18_index = filter(reps_df ,type == "index", cohort == "Sm18")
Sm18_index_dvg = unique(Sm18_index$DVG)
Sp19_index = filter(reps_df ,type == "index", cohort == "Sp19")
Sp19_index_dvg = unique(Sp19_index$DVG)
Sp20_index = filter(reps_df ,type == "index", cohort == "Sp20")
Sp20_index_dvg = unique(Sp20_index$DVG)
F17_shared = F17_index %>% filter(DVG %in% F17_stock_dvg) %>% filter((DVG %in% F17_index_dvg)) %>% unique()
F17_denovo = F17_index %>% filter((DVG %in% F17_index_dvg)) %>% filter(!(DVG %in% F17_stock_dvg)) %>% unique()
W17_shared = W17_index %>% filter(DVG %in% W17_stock_dvg) %>% filter((DVG %in% W17_index_dvg)) %>% unique()
W17_denovo = W17_index %>% filter((DVG %in% W17_index_dvg)) %>% filter(!(DVG %in% W17_stock_dvg)) %>% unique()
Sm18_shared = Sm18_index %>% filter(DVG %in% Sm18_stock_dvg) %>% filter((DVG %in% Sm18_index_dvg)) %>% unique()
Sm18_denovo = Sm18_index %>% filter((DVG %in% Sm18_index_dvg)) %>% filter(!(DVG %in% Sm18_stock_dvg)) %>% unique()
Sp19_shared = Sp19_index %>% filter(DVG %in% Sp19_stock_dvg) %>% filter((DVG %in% Sp19_index_dvg)) %>% unique()
Sp19_denovo = Sp19_index %>% filter((DVG %in% Sp19_index_dvg)) %>% filter(!(DVG %in% Sp19_stock_dvg)) %>% unique()
Sp20_shared = Sp20_index %>% filter(DVG %in% Sp20_stock_dvg) %>% filter((DVG %in% Sp20_index_dvg)) %>% unique() # still not working
Sp20_denovo = Sp20_index %>% filter((DVG %in% Sp20_index_dvg)) %>% filter(!(DVG %in% Sp20_stock_dvg)) %>% unique() # still not working
stock_shared = rbind(F17_shared,W17_shared,Sm18_shared,Sp19_shared,Sp20_shared)
index_unique = rbind(F17_denovo,W17_denovo,Sm18_denovo,Sp19_denovo,Sp20_denovo)
stock_obese = filter(stock_shared, ferret_weight == "obese")
o_dvg = unique(stock_obese$DVG)
stock_lean = filter(stock_shared, ferret_weight == "lean")
l_dvg = unique(stock_lean$DVG)
stock_dvg <- list(Obese = o_dvg, Lean = l_dvg)
StockDVGs = ggVennDiagram(stock_dvg)
print(StockDVGs)
ggsave(StockDVGs, file = "StockDVGs.pdf", path = saveitdir)
Saving 7.29 x 4.51 in image

ShockSharedDVGs = ggplot(stock_shared, aes(x = dpi, y = DVG)) +
geom_point() +
geom_line(aes(group = DVG)) +
facet_grid(~segment) +
PlotTheme1
print(ShockSharedDVGs)
ggsave(ShockSharedDVGs, file = "ShockSharedDVGs.pdf", path = saveitdir)
Saving 7.29 x 4.51 in image

Are there diet-specific DVGs in index ferrets?
index_obese = filter(index_unique, ferret_weight == "obese")
o_dvg = unique(index_obese$DVG)
index_lean = filter(index_unique, ferret_weight == "lean")
l_dvg = unique(index_lean$DVG)
diet_dvg <- list(Obese = o_dvg, Lean = l_dvg)
DietUniqueDVGs = ggVennDiagram(diet_dvg)
print(DietUniqueDVGs)
ggsave(DietUniqueDVGs, file = "DietUniqueDVGs.pdf", path = saveitdir)
Saving 7.29 x 4.51 in image

Pulling out diet-specific DVGs
lean = index_lean %>%
filter(DVG %in% l_dvg) %>%
filter(!(DVG %in% o_dvg)) %>%
unique()
lean = lean %>%
group_by(DVG) %>%
mutate(count = 1, totalsamp = sum(count))
mult_lean = filter(lean, totalsamp > 1) %>%
unique()
obese = index_unique %>%
filter((DVG %in% o_dvg)) %>%
filter(!(DVG %in% l_dvg)) %>%
unique()
obese = obese %>%
group_by(DVG) %>%
mutate(count = 1, totalsamp = sum(count))
mult_obese = filter(obese, totalsamp > 1) %>%
unique()
lean_uniques = lean %>%
ungroup() %>%
select(segment,DVG_group,GroupBoundaries,totalsamp) %>%
unique() %>%
arrange(desc(totalsamp))
print(lean_uniques)
obese_uniques = obese %>%
ungroup() %>%
select(segment,DVG_group,GroupBoundaries,totalsamp) %>%
unique() %>%
arrange(desc(totalsamp))
print(obese_uniques)
Are DVGs transmitted?
dvg_df = select(reps_df, ferret_day, ferret_id, dpi, ferret_weight, type, segment, DVG_group, GroupBoundaries, pair, index_direct, pair_ids) %>%
ungroup() %>%
unique()
dvg_df$seg_dvg = paste0(dvg_df$segment,"_",dvg_df$DVG_group)
index = filter(dvg_df, type == "index") %>% unique()
first_time = c("1794_d04","1797_d02","1913_d06","1914_d06","1980_d02","1981_d10","1986_d10","2231_d06","2232_d02","2239_d02")
early_time = c("1794_d04","1797_d02","1980_d02","2232_d02","2239_d02")
direct = filter(dvg_df, ferret_day %in% first_time) %>% unique()
samples = unique(index$ferret_day)
dvg_transmitted = data.frame()
for(i in samples){
print(i)
n = filter(index, ferret_day == i)
partner = unique(n$pair_ids)
d = filter(direct, pair_ids %in% partner)
if(nrow(d) > 0){
s = unique(d$ferret_day)
print(s)
comp = merge(n, d, by = c("index_direct","pair_ids","pair","segment","seg_dvg","DVG_group","GroupBoundaries"), all.x = TRUE) %>%
count(pair_ids,ferret_day.x, ferret_day.y)
colnames(comp) = c("pair_ids","index","contact","count")
dvg_transmitted = rbind(dvg_transmitted, comp)
}else(print("No transmission"))
}
#dvg_transmitted = dvg_transmitted %>%
# pivot_wider(names_from = contact, values_from = count)
contact_dvgs = filter(dvg_df, type == "direct") %>% count(ferret_id,dpi,ferret_weight) %>%
ggplot(., aes(x = dpi, y = n, color = ferret_weight)) +
geom_point() +
geom_line(aes(group = ferret_id)) +
facet_grid(~ferret_id) +
ylab("DVG richness") +
xlab("DPI") +
PlotTheme1 +
weight_colScale
print(contact_dvgs)
ggsave(contact_dvgs, file = "contact_dvgs.pdf", path = saveitdir)
Saving 7.29 x 4.51 in image

---
title: "DVG_Analysis"
output: html_notebook
---

Kate's DVG code

```{r}
message("Loading packages")
library('ggplot2')
library('readr')
library('reshape2')
library('ggpubr')
library('plyr')
library('tidyverse')
library('dplyr')
library('glue')
library('ggVennDiagram')
```

```{r}
# paramters used when running divrge
grouping_param = 5
match_length_param = 28
readLength = 150

# deletion read count cutoffs
count_cut = 30

# only looking at index and direct cases
keeping = c('index','direct')
```

```{r}
message("Setting work directory and input file names")
wkdir = "/Users/marissaknoll/Desktop/GitHub/Obesity/NewExtractions/H9N2/DVGs"
setwd(wkdir)
```

```{r}
if (!dir.exists(glue("{wkdir}/DVG_figures"))) {
        dir.create(glue("{wkdir}/DVG_figures"))
      }

saveitdir = glue("{wkdir}/DVG_figures")
```

```{r}
source(glue('{wkdir}/scripts/obese_PlotPrep.R'))
```

```{r}
# loading in metadata and coverage data
metafile = glue("{wkdir}/scripts/transmission_h9n2_use_long.csv")
meta = read.csv(file=metafile,header=T,sep=",",na.strings = c('')) 
meta = filter(meta, flag == "keep")
coverage_passfile = glue('{wkdir}/scripts/H9N2.coverage.pass.check.200.0.95.csv')
cov_check = read.csv(file=coverage_passfile,header=T,sep=",",na.strings = c(''))
```

```{r}
# filter for samples that either pass with a yes OR has good average coverage and percentage cov at 200x is > 80
cov_filt_names = cov_check %>% filter(pass == 'YES' | 
                                      mean_coverage >= 200  | 
                                      percentage > 0.7) %>% 
            select(name, segment) %>% 
            unique()

cov_filt_names = filter(cov_filt_names, name != "2244_d12") %>% unique()

# check segment count
cov_filt_names = cov_filt_names %>% group_by(name) %>% add_tally(name = 'segment_tally') %>% 
                    ungroup() %>% 
                    filter(segment_tally == 8) %>% 
                    unique() 

pull_names = c(levels(factor(cov_filt_names$name)))  # list to pull names from
```

```{r}
dvgfile = glue('{wkdir}/H9N2.DVG.FINAL.OneGap.N5.Mis2.M28.G5.csv') # dvg file
dvg = read.csv(file=dvgfile,header=T,sep=",",na.strings = c(''))

dvg = dvg %>% filter(name %in% pull_names) # filter for samples that pass our coverage checks
dvg$sample = dvg$name  # generate new column so we can separate
dvg = dvg %>% separate(sample, c('new','cohort','ferret_id','dpi','rep'), '_')  # separate into info

CONTROLS = dvg %>% filter(ferret_id == 'HK1073')  # pulling out controls
CONTROLS$rep = CONTROLS$dpi
CONTROLS$dpi = 'stock'  # adding in stock info

dvg = dvg %>% filter(!name %in% c(levels(factor(CONTROLS$name)))) %>% unique()

dvg = rbind(dvg, CONTROLS) # rbind everything so it is all in one dataframe
```

```{r}
# prepping rep information
dvg = dvg %>% select(-SegTotalDVG) %>% filter(DVG_freq >= count_cut) %>% unique()  # filter for those that pass cutoffs
rep1 = dvg %>% filter(rep == 'rep1') %>% unique()
rep2 = dvg %>% filter(rep == 'rep2') %>% unique()
```

```{r}
# merge reps into one
dvg_reps = merge(rep1, rep2, by = c('cohort','ferret_id','dpi',
                                   'segment','segment_size','strain',
                                   'DVG_group','NewGap',
                                   'NewStart','NewEnd','GroupBoundaries',
                                   'DeletionSize','EstimatedFragLength'), all = TRUE) %>% unique()
```

```{r}
# add in zeros
dvg_reps$DVG_freq.x[is.na(dvg_reps$DVG_freq.x)] = 0
dvg_reps$DVG_freq.y[is.na(dvg_reps$DVG_freq.y)] = 0
```

```{r}
ggplot(dvg_reps, aes(x=DVG_freq.x, y=DVG_freq.y)) + 
    geom_point() + 
    PlotTheme1
```

```{r}
# number of samples?
levels(factor(dvg_reps$ferret_id)) %>% length()
```

```{r}
# reorder by segment size
SEGMENTS = c('H9N2_PB2', 'H9N2_PB1',
            'H9N2_PA','H9N2_HA','H9N2_NP', 
            'H9N2_NA','H9N2_MP','H9N2_NS')

cov_check$segment = factor(cov_check$segment, levels = SEGMENTS)
```

```{r}
cov_check %>% 
    filter(name %in% pull_names) %>%
    ggplot(., aes(x= segment, y = mean_coverage)) + 
    geom_boxplot() + 
    PlotTheme1

cov_check %>% 
    filter(name %in% pull_names) %>%
    ggplot(., aes(x= segment, y = median_coverage)) + 
    geom_boxplot() + 
    PlotTheme1

cov_check %>% 
    filter(name %in% pull_names) %>%
    ggplot(., aes(x= segment, y = percentage)) + 
    geom_boxplot() + 
    PlotTheme1
```

```{r}
df = cov_filt_names %>% select(-segment, -segment_tally) %>% unique()

df$sample = df$name

df = df %>% separate(sample, c('new','cohort','ferret_id','dpi','rep'), '_')

CONTROLS = df %>% filter(ferret_id == 'HK1073')

CONTROLS$rep = CONTROLS$dpi

CONTROLS$dpi = 'stock'


df = df %>% filter(!name %in% c(levels(factor(CONTROLS$name)))) %>% unique()

df = rbind(df, CONTROLS)

r1 = df %>% filter(rep == 'rep1') %>% select(-new) %>% unique()
r2 = df %>% filter(rep == 'rep2') %>% select(-new) %>% unique()
reps = merge(r1, r2, by = c('cohort','ferret_id','dpi')) %>% unique()


# these are the samples that only had one rep!
setdiff(levels(factor(r1$ferret_id)),
       levels(factor(r2$ferret_id)))
```

```{r}
setdiff(meta$ferret_id, reps$ferret_id)  # samples in meta not in seq data
setdiff(reps$ferret_id, meta$ferret_id) # samples in seq data not in meta

m = merge(reps, meta, by = c('ferret_id'), all = TRUE)

write.csv(m, glue('{wkdir}/scripts/UPDATED.H9N2.metadata.csv'), row.names = FALSE)
```

```{r}
temp = meta %>% 
        filter(type %in% c('index','direct')) %>% 
        unique() %>% 
        select(pair, type, ferret_id) %>% unique() %>% 
        pivot_wider(names_from = 'type', values_from = 'ferret_id')

temp = rbind(temp, c('Z','stock','stock'))

temp$pair_ids = paste0(temp$index,'>',temp$direct)

temp = temp  %>% select(pair, pair_ids) %>% unique()

m = merge(m, temp, by = c('pair'))  # add in additional information to metadata to work with
```

```{r}
# type check - only stock index direct
print(levels(factor(m$type)))
```

```{r}
m$type = factor(m$type, levels = c('stock','index','direct'))

m = m %>% filter(name.x != is.na(name.x)) %>% unique()
```

```{r}
p1 = m %>% filter(ferret_id != 'HK1073') %>% unique() %>% 
    ggplot(., aes(x= dpi, y = pair_ids, fill = ferret_weight)) + 
    geom_tile(color = 'black') + 
    PlotTheme3 +
    weight_colFill + 
    facet_grid(index_direct~type, scales = 'free', space = 'free')

print(p1)

ggsave(p1,
       filename = glue("{wkdir}/DVG_figures/final.samples.pdf"),
       width = 6,
       height = 6, limitsize=FALSE, useDingbats = FALSE)

ggsave(p1,
       filename = glue("{wkdir}/DVG_figures/final.samples.png"),
       width = 6,
       height = 6, limitsize=FALSE) #, useDingbats = FALSE)
```

```{r}
dvg_reps = dvg_reps %>% 
            filter(DVG_freq.x > count_cut & DVG_freq.y > count_cut) %>% 
        unique()   # make sure that both reps pass our cutoff

# add in variables for plotting
dvg_reps$ferret_day = paste0(dvg_reps$ferret_id, '_', dvg_reps$dpi)  

m$ferret_day = paste0(m$ferret_id, '_', m$dpi)
```

```{r}
stock_temp = dvg_reps %>% filter(dpi == 'stock') %>%
    group_by(ferret_id, cohort, dpi, segment, name.x, name.y) %>%
    add_tally(name = 'seg_deletion_richness') %>%
    unique() %>%
    ungroup() %>% 
    group_by(ferret_id, dpi, name.x, name.y, cohort) %>% 
    add_tally(name = 'deletion_richness') %>%
    ungroup() %>% 
    unique()

s = stock_temp  # will use later

# filter down stock temp information
stock_temp = stock_temp %>% 
            select(ferret_id, dpi, cohort,ferret_day, segment, deletion_richness, seg_deletion_richness) %>%
            unique()


stock_temp = merge(stock_temp, m, by = c('ferret_id', 'dpi','cohort','ferret_day')) %>% 
    unique()
```

```{r}
# filter out stock information, calculate dvg richness by segment and across genome for samples

dr = dvg_reps %>% 
            filter(dpi != 'stock') %>% 
            unique() %>% 
            group_by(ferret_id, dpi, segment, name.x, name.y, cohort) %>%
            add_tally(name = 'seg_deletion_richness') %>%
            ungroup() %>% 
            group_by(ferret_id, dpi, name.x, name.y, cohort) %>% 
            add_tally(name = 'deletion_richness') %>%
            ungroup() %>% 
            unique()

# filter down information so you don't have duplicates
richness = dr %>% 
            select(ferret_id, dpi, cohort,ferret_day, segment, deletion_richness, seg_deletion_richness) %>%
            unique()

# merge with metadata info
richness = merge(richness, m, by = c('ferret_id', 'dpi','cohort','ferret_day'), all.y = TRUE) %>%
    unique()

# make sure we filter out stock information (will add using the 's' dataframe generated above)
richness = richness %>% filter(dpi != 'stock')

reps_df = rbind(dr, s) # final reps richness df

reps_df = merge(reps_df, m, by = c('ferret_id','dpi','cohort','ferret_day'))  # add metadata
```

```{r}
p4 = reps_df %>% 
    select(segment, NewGap, EstimatedFragLength, ferret_weight) %>%
    unique() %>%
    ggplot(., aes(x= EstimatedFragLength)) + 
    geom_histogram(color = 'black') + 
    PlotTheme1 +
    labs(x="estimated DVG frag. length (nt)", y='number of unique DVG species') 
print(p4)

ggsave(p4,
       filename = glue("{wkdir}/DVG_figures/deletion.size.pdf"),
       width = 5,
       height = 5, limitsize=FALSE, useDingbats = FALSE)

ggsave(p4,
       filename = glue("{wkdir}/DVG_figures/deletion.size.png"),
       width =5,
       height = 5, limitsize=FALSE) #, useDingbats = FALSE)

p4_alt = reps_df %>% 
    select(segment, NewGap, EstimatedFragLength, ferret_weight, type) %>%
    unique() %>%
    ggplot(., aes(x= EstimatedFragLength)) + 
    geom_histogram(color = 'black', binwidth = 50) + 
    facet_grid(type~ferret_weight) +
    PlotTheme1 +
    labs(x="estimated DVG frag. length (nt)", y='number of unique DVG species') 
print(p4_alt)
ggsave(p4_alt,
       filename = glue("{wkdir}/DVG_figures/deletion.size.bydiet.bytype.pdf"),
       width = 10,
       height = 5, limitsize=FALSE, useDingbats = FALSE)

```

```{r}
lean_index =  reps_df %>% filter(type == 'index' & ferret_weight == 'lean') %>% 
                unique() %>%
                group_by(NewGap, segment, NewStart, NewEnd) %>%
            add_tally(name = 'lean_deletion_count') %>%
    ungroup() %>%
    select(NewGap, segment, lean_deletion_count) %>%
    unique()


obese_index =  reps_df %>% filter(type == 'index' & ferret_weight == 'obese') %>% 
                unique() %>%
                group_by(NewGap, segment, NewStart, NewEnd) %>%
            add_tally(name = 'obese_deletion_count') %>%
    ungroup() %>%
    select(NewGap, segment, obese_deletion_count) %>%
    unique()


df = merge(lean_index, obese_index, by = c('NewGap','segment'), all = TRUE) 

head(df)

df$lean_deletion_count[is.na(df$lean_deletion_count)] = 0

df$obese_deletion_count[is.na(df$obese_deletion_count)] = 0
```

```{r}
p8 = reps_df %>% filter(type == 'index') %>% 
                unique() %>%
                group_by(NewGap, segment, NewStart, NewEnd) %>%
            add_tally(name = 'sample_count') %>%
    ungroup() %>%
    select(NewGap, segment,sample_count) %>%
    unique()  %>%
    ggplot(., aes(x=sample_count, y = ..count../sum(..count..))) +
    geom_histogram(color ='black') + 
    labs(x='number of samples with DVG type', y='proportion of DVGs in dataset (index only)') + 
    PlotTheme1 

print(p8)
ggsave(p8,
       filename = glue("{wkdir}/DVG_figures/sample.count.histo.pdf"),
       width = 5,
       height = 5, limitsize=FALSE, useDingbats = FALSE)

ggsave(p8,
       filename = glue("{wkdir}/DVG_figures/sample.count.histo.png"),
       width =5,
       height = 5, limitsize=FALSE) #, useDingbats = FALSE)
```

```{r}
reps_df$ave_dvg_freq = (reps_df$DVG_freq.x + reps_df$DVG_freq.y)/2
```

```{r}
reps_df = reps_df %>%
  arrange(ferret_day, ave_dvg_freq) %>% 
  group_by(ferret_day) %>% 
  mutate(order_number = row_number()) %>% 
  ungroup() %>%
  unique()
```

```{r}
reps_df %>% 
    group_by(NewGap, segment, type, ferret_weight) %>%
    mutate(mean_order = mean(order_number),
          sample_count = n(),
          min_order = min(order_number),
          max_order = max(order_number),
          median_ord = median(order_number)) %>%
    ungroup() %>%
    unique() %>%
    select(segment, NewGap, mean_order, sample_count, min_order, max_order, median_ord, type, ferret_weight) %>%
    filter(sample_count > 1) %>%
    unique() %>%
    ggplot(., aes(y=mean_order, x = sample_count)) + 
    geom_point() + 
    PlotTheme1 + 
    facet_grid(.~ferret_weight + type)
```

```{r}
top_ten = reps_df %>% filter(order_number %in% c(1, 2,3 ,4, 5, 6, 7, 8, 9, 10)) %>% unique()

head(top_ten)

length(levels(factor(top_ten$NewGap)))
```

```{r}
max(df$lean_deletion_count)
max(df$obese_deletion_count)

p9 = ggplot(df, aes(x=lean_deletion_count, y = obese_deletion_count)) + 
    geom_jitter(width = 0.1, height = 0.1, alpha = 0.3) + 
    geom_hline(yintercept = 0, linetype = 2, color = 'black') +
    geom_hline(yintercept = 25, linetype = 2, color = 'red') +
    geom_vline(xintercept = 0, linetype = 2, color = 'black') +
    geom_vline(xintercept = 17, linetype = 2, color = 'red') +
    labs(x= 'number of lean samples with DVG', y='number of obese samples with DVG') + 
    PlotTheme1 

print(p9)
ggsave(p9,
       filename = glue("{wkdir}/DVG_figures/sample.count.lean.v.obese.pdf"),
       width = 5,
       height = 5, limitsize=FALSE, useDingbats = FALSE)

ggsave(p9,
       filename = glue("{wkdir}/DVG_figures/sample.count.lean.v.obese.png"),
       width =5,
       height = 5, limitsize=FALSE) #, useDingbats = FALSE)
```

```{r}
reps_df %>% filter(type == 'index' & ferret_weight == 'obese') %>% select(name.x.x) %>% unique() %>% nrow()
reps_df %>% filter(type == 'index' & ferret_weight == 'lean') %>% select(name.x.x) %>% unique() %>% nrow()
```

```{r}
richness = rbind(richness, stock_temp)
richness$deletion_richness[is.na(richness$deletion_richness)] = 0
DAYS = c('stock','d02','d04','d06','d08','d10','d12')
```

```{r}
richness$cohort = gsub("FCC","Sp19",richness$cohort)
richness$dpi = factor(richness$dpi, levels = DAYS)
richness %>% filter(dpi %in% c('d02','d04')) %>%
        filter(type == 'index' | type == 'stock') %>%
    select(ferret_id, dpi, deletion_richness, type, ferret_weight, index_direct, cohort) %>%
    unique() %>%
    group_by(ferret_id) %>%
    add_tally(name = 'n') %>%
    ungroup() %>%
    filter(n >= 2) %>% 
    ungroup() %>%
    unique() %>%
    ggplot(., aes(x=dpi, y = deletion_richness, color = cohort, group=ferret_id, shape = ferret_weight)) + 
    #geom_boxplot() + 
    geom_line() + 
    geom_point(size = 2) + 
    PlotTheme1 +
    scale_color_brewer(palette = 'Set1')
    #weight_colScale + 
    #facet_grid(.~cohort)


p7 = richness %>% filter(dpi %in% c('d02','d04','d06')) %>%
        filter(type == 'index' | type == 'stock') %>%    
    select(ferret_id, dpi, deletion_richness, type, ferret_weight, index_direct, cohort) %>%
    unique() %>%
    group_by(ferret_id) %>%
    add_tally(name = 'n') %>%
    ungroup() %>%
    filter(n >= 2) %>% 
    ungroup() %>%
    unique() %>%
    ggplot(., aes(x=dpi, y = deletion_richness, color = ferret_weight, group=ferret_id, shape = ferret_weight)) + 
    #geom_boxplot() + 
    geom_line() + 
    geom_point(size = 2) + 
    PlotTheme1 +
    weight_colScale
    #facet_grid(.~cohort)

print(p7)
ggsave(p7,
       filename = glue("{wkdir}/DVG_figures/richness.index.pdf"),
       width = 5,
       height = 5, limitsize=FALSE, useDingbats = FALSE)

ggsave(p7,
       filename = glue("{wkdir}/DVG_figures/richness.index.png"),
       width =5,
       height = 5, limitsize=FALSE) #, useDingbats = FALSE)
```

```{r}
colnames(richness)
order_typeday = c('stock','index_d02','index_d04',
                 'index_d06','index_d08','index_d10',
                 'index_d12',
                 'direct_d02','direct_d04',
                 'direct_d06','direct_d08','direct_d10',
                 'direct_d12')
```

```{r}
richness$type_day = paste0(richness$type, '_', richness$dpi)
richness$type_day = factor(richness$type_day, levels = order_typeday)

p2 = richness %>% filter(ferret_weight == 'obese' & index_direct == 'obese_obese') %>%
    select(ferret_id, dpi, deletion_richness, type, ferret_weight, 
           pair_ids, index_direct, type_day) %>%
    ungroup() %>%
    unique() %>%
    ggplot(., aes(x=type_day, y = deletion_richness, color = pair_ids, group=pair_ids)) + 
    #geom_boxplot() + 
    geom_line(size = 1) + 
    geom_point(size = 2) + 
    labs(x='dpi (by index case)', y='DVG richness') + 
    PlotTheme1 +
    scale_color_brewer(palette = 'Set2') #+
    #weight_colScale + 
    #facet_grid(.~type)

print(p2)
ggsave(p2,
       filename = glue("{wkdir}/DVG_figures/obese.to.obese.diversity.pdf"),
       width = 8,
       height = 6, limitsize=FALSE, useDingbats = FALSE)

ggsave(p2,
       filename = glue("{wkdir}/DVG_figures/obese.to.obese.diversity.png"),
       width =8,
       height = 6, limitsize=FALSE) #, useDingbats = FALSE)
```

```{r}
gen_rich = richness %>% 
  select(ferret_id, dpi, cohort,deletion_richness, type, ferret_weight, pair_ids, index_direct, type_day) %>%
  unique()

head(gen_rich)

gen_rich %>% filter(dpi %in% c('d02','d04','d06','stock')) %>%
    filter(type == 'index' | type == "stock") %>%
    ggplot(., aes(x=ferret_weight, y = deletion_richness, group = ferret_weight)) + 
    geom_boxplot(outlier.shape = NA) + 
    geom_jitter(width = 0.2, aes(color = ferret_weight)) +
    labs(x='segment',y='deletion richness') + 
    PlotTheme1 +
    weight_colScale +
    facet_grid(.~dpi)
```

Test for significance
```{r}
o = filter(gen_rich, type == "index" & dpi == "d06" & ferret_weight == "obese")
l = filter(gen_rich, type == "index" & dpi == "d06" & ferret_weight == "lean")
t.test(o$deletion_richness,l$deletion_richness)
```

```{r}
seg_rich = richness %>% #filter(ferret_weight == 'obese' & index_direct == 'obese_obese') %>%
    select(ferret_id, dpi, seg_deletion_richness, type, ferret_weight, 
           pair_ids, index_direct, type_day, segment) %>%
    unique()

head(seg_rich)

temp = seg_rich %>% select(ferret_id, dpi, type, ferret_weight, pair_ids, type_day, index_direct) %>% unique()
temp$H9N2_PB2 = 0
temp$H9N2_PB1 = 0
temp$H9N2_PA = 0
temp$H9N2_HA = 0
temp$H9N2_NP = 0
temp$H9N2_NA = 0
temp$H9N2_MP = 0
temp$H9N2_NS = 0

temp = pivot_longer(temp, cols = all_of(SEGMENTS), names_to = 'segment') %>% select(-value)

seg_rich = merge(seg_rich, temp, by = c('ferret_id', 'dpi', 'type', 'ferret_weight', 
           'pair_ids', 'index_direct', 'type_day', 'segment'), all = TRUE) 

seg_rich$seg_deletion_richness[is.na(seg_rich$seg_deletion_richness)] = 0

seg_rich$segment = factor(seg_rich$segment, levels = SEGMENTS)
head(seg_rich)
```

```{r}
p3 = seg_rich %>% filter(segment %in% SEGMENTS) %>%
    ggplot(., aes(x=segment, y = seg_deletion_richness)) + 
    geom_boxplot() + 
    labs(x='segment',y='deletion richness') + 
    PlotTheme1 

print(p3)

ggsave(p3,
       filename = glue("{wkdir}/DVG_figures/segment.richness.pdf"),
       width = 5,
       height = 5, limitsize=FALSE, useDingbats = FALSE)

ggsave(p3,
       filename = glue("{wkdir}/DVG_figures/segment.richness.png"),
       width =5,
       height = 5, limitsize=FALSE) #, useDingbats = FALSE)
```

```{r}
seg_rich$seg_weight = paste0(seg_rich$segment, '_', seg_rich$ferret_weight)
seg_rich$ferret_weight = factor(seg_rich$ferret_weight, levels = c('stock','lean','obese'))
```

```{r}
seg_rich %>% filter(segment %in% SEGMENTS) %>%
    ggplot(., aes(x=segment, y = seg_deletion_richness, group = seg_weight, color = ferret_weight)) + 
    geom_boxplot() + 
    labs(x='segment',y='deletion richness') + 
    PlotTheme1 +
    weight_colScale +
    facet_grid(.~type)


p6 = seg_rich %>% filter(segment %in% SEGMENTS & dpi %in% c('d02','d04','d06','stock')) %>%
    filter(type == 'index' | type == "stock") %>%
    ggplot(., aes(x=segment, y = seg_deletion_richness, group = seg_weight, color = ferret_weight)) + 
    geom_boxplot(outlier.shape = NA) + 
    labs(x='segment',y='deletion richness') + 
    PlotTheme1 +
    weight_colScale +
    facet_grid(.~dpi)

print(p6)

ggsave(p6,
       filename = glue("{wkdir}/DVG_figures/segment.index.richness.pdf"),
       width = 8,
       height = 4, limitsize=FALSE, useDingbats = FALSE)

ggsave(p6,
       filename = glue("{wkdir}/DVG_figures/segment.index.richness.png"),
       width =8,
       height = 4, limitsize=FALSE) #, useDingbats = FALSE)


```
Test for significance
```{r}
o = filter(seg_rich, type == "index" & dpi == "d02" & segment == "H9N2_PB2" & ferret_weight == "obese")
l = filter(seg_rich, type == "index" & dpi == "d02" & segment == "H9N2_PB2" & ferret_weight == "lean")
t.test(o$seg_deletion_richness,l$seg_deletion_richness)
```

```{r}
seg_rich %>% filter(type == 'index' &  segment %in% SEGMENTS) %>%
    ungroup() %>%
    unique() %>%
    ggplot(., aes(x=type_day, y = seg_deletion_richness, color = segment, group=segment)) + 
    #geom_boxplot() + 
    geom_line(size = 1) + 
    geom_point(size = 2) + 
    labs(x='dpi (by index case)', y='DVG richness') + 
    PlotTheme1 +
    scale_color_brewer(palette = 'Set2') +
    #weight_colScale + 
    facet_grid(.~ferret_weight + ferret_id, scales = 'free', space = 'free')
```

```{r}
p4 = seg_rich %>% filter(ferret_weight == 'obese' & index_direct == 'obese_obese' & segment %in% SEGMENTS) %>%
    ungroup() %>%
    unique() %>%
    ggplot(., aes(x=type_day, y = seg_deletion_richness, color = segment, group=segment)) + 
    #geom_boxplot() + 
    geom_line(size = 1) + 
    geom_point(size = 2) + 
    labs(x='dpi (by index case)', y='DVG richness') + 
    PlotTheme1 +
    scale_color_brewer(palette = 'Set2') +
    #weight_colScale + 
    facet_grid(.~pair_ids, scales = 'free', space = 'free')

print(p4)


ggsave(p4,
       filename = glue("{wkdir}/DVG_figures/segment.obese.to.obese.richness.pdf"),
       width = 12,
       height = 4, limitsize=FALSE, useDingbats = FALSE)

ggsave(p4,
       filename = glue("{wkdir}/DVG_figures/segment.obese.to.obese.richness.png"),
       width =12,
       height = 4, limitsize=FALSE) #, useDingbats = FALSE)
```

End of Kate's code

Which DVGs are shared between stock and index?
```{r}
reps_df$DVG = paste0(reps_df$segment,"_",reps_df$DVG_group)

F17_stock = filter(reps_df ,type == "stock", cohort == "F17")
F17_stock_dvg = unique(F17_stock$DVG)
W17_stock = filter(reps_df ,type == "stock", cohort == "W17")
W17_stock_dvg = unique(W17_stock$DVG)
Sm18_stock = filter(reps_df ,type == "stock", cohort == "Sm18")
Sm18_stock_dvg = unique(Sm18_stock$DVG)
Sp19_stock = filter(reps_df ,type == "stock", cohort == "Sp19")
Sp19_stock_dvg = unique(Sp19_stock$DVG)
Sp20_stock = filter(reps_df ,type == "stock", cohort == "Sp20")
Sp20_stock_dvg = unique(Sp20_stock$DVG)

F17_index = filter(reps_df ,type == "index", cohort == "F17")
F17_index_dvg = unique(F17_index$DVG)
W17_index = filter(reps_df ,type == "index", cohort == "W17")
W17_index_dvg = unique(W17_index$DVG)
Sm18_index = filter(reps_df ,type == "index", cohort == "Sm18")
Sm18_index_dvg = unique(Sm18_index$DVG)
Sp19_index = filter(reps_df ,type == "index", cohort == "Sp19")
Sp19_index_dvg = unique(Sp19_index$DVG)
Sp20_index = filter(reps_df ,type == "index", cohort == "Sp20")
Sp20_index_dvg = unique(Sp20_index$DVG)

F17_shared = F17_index %>% filter(DVG %in% F17_stock_dvg) %>% filter((DVG %in% F17_index_dvg)) %>% unique()
F17_denovo = F17_index %>% filter((DVG %in% F17_index_dvg)) %>% filter(!(DVG %in% F17_stock_dvg)) %>% unique()

W17_shared = W17_index %>% filter(DVG %in% W17_stock_dvg) %>% filter((DVG %in% W17_index_dvg)) %>% unique()
W17_denovo = W17_index %>% filter((DVG %in% W17_index_dvg)) %>% filter(!(DVG %in% W17_stock_dvg)) %>% unique()

Sm18_shared = Sm18_index %>% filter(DVG %in% Sm18_stock_dvg) %>% filter((DVG %in% Sm18_index_dvg)) %>% unique()
Sm18_denovo = Sm18_index %>% filter((DVG %in% Sm18_index_dvg)) %>% filter(!(DVG %in% Sm18_stock_dvg)) %>% unique()

Sp19_shared = Sp19_index %>% filter(DVG %in% Sp19_stock_dvg) %>% filter((DVG %in% Sp19_index_dvg)) %>% unique()
Sp19_denovo = Sp19_index %>% filter((DVG %in% Sp19_index_dvg)) %>% filter(!(DVG %in% Sp19_stock_dvg)) %>% unique()

Sp20_shared = Sp20_index %>% filter(DVG %in% Sp20_stock_dvg) %>% filter((DVG %in% Sp20_index_dvg)) %>% unique() # still not working
Sp20_denovo = Sp20_index %>% filter((DVG %in% Sp20_index_dvg)) %>% filter(!(DVG %in% Sp20_stock_dvg)) %>% unique() # still not working

stock_shared = rbind(F17_shared,W17_shared,Sm18_shared,Sp19_shared,Sp20_shared)
index_unique = rbind(F17_denovo,W17_denovo,Sm18_denovo,Sp19_denovo,Sp20_denovo)
```

```{r}
stock_obese = filter(stock_shared, ferret_weight == "obese") 
o_dvg = unique(stock_obese$DVG)

stock_lean = filter(stock_shared, ferret_weight == "lean") 
l_dvg = unique(stock_lean$DVG)

stock_dvg <- list(Obese = o_dvg, Lean = l_dvg)

StockDVGs = ggVennDiagram(stock_dvg)
print(StockDVGs)
ggsave(StockDVGs, file = "StockDVGs.pdf", path = saveitdir)

ShockSharedDVGs = ggplot(stock_shared, aes(x = dpi, y = DVG)) +
  geom_point() +
  geom_line(aes(group = DVG)) +
  facet_grid(~segment) +
  PlotTheme1
print(ShockSharedDVGs)
ggsave(ShockSharedDVGs, file = "ShockSharedDVGs.pdf", path = saveitdir)
```
Are there diet-specific DVGs in index ferrets?
```{r}
index_obese = filter(index_unique, ferret_weight == "obese") 
o_dvg = unique(index_obese$DVG)

index_lean = filter(index_unique, ferret_weight == "lean") 
l_dvg = unique(index_lean$DVG)

diet_dvg <- list(Obese = o_dvg, Lean = l_dvg)

DietUniqueDVGs = ggVennDiagram(diet_dvg)
print(DietUniqueDVGs)
ggsave(DietUniqueDVGs, file = "DietUniqueDVGs.pdf", path = saveitdir)
```

Pulling out diet-specific DVGs
```{r}
lean = index_lean %>% 
  filter(DVG %in% l_dvg) %>% 
  filter(!(DVG %in% o_dvg)) %>%
  unique()

lean = lean %>%
  group_by(DVG) %>% 
  mutate(count = 1, totalsamp = sum(count))

mult_lean = filter(lean, totalsamp > 1) %>% 
  unique()

obese = index_unique %>% 
  filter((DVG %in% o_dvg)) %>%
  filter(!(DVG %in% l_dvg)) %>% 
  unique()

obese = obese %>%
  group_by(DVG) %>% 
  mutate(count = 1, totalsamp = sum(count))

mult_obese = filter(obese, totalsamp > 1) %>% 
  unique()
```

```{r}
lean_uniques = lean %>%
  ungroup() %>% 
  select(segment,DVG_group,GroupBoundaries,totalsamp) %>% 
  unique() %>% 
  arrange(desc(totalsamp))
print(lean_uniques)

obese_uniques = obese %>%
  ungroup() %>% 
  select(segment,DVG_group,GroupBoundaries,totalsamp) %>% 
  unique() %>% 
  arrange(desc(totalsamp))
print(obese_uniques)
```

Are DVGs transmitted?
```{r}
dvg_df = select(reps_df, ferret_day, ferret_id, dpi, ferret_weight, type, segment, DVG_group, GroupBoundaries, pair, index_direct, pair_ids) %>%
  ungroup() %>%
  unique()
dvg_df$seg_dvg = paste0(dvg_df$segment,"_",dvg_df$DVG_group)

index = filter(dvg_df, type == "index") %>% unique()
first_time = c("1794_d04","1797_d02","1913_d06","1914_d06","1980_d02","1981_d10","1986_d10","2231_d06","2232_d02","2239_d02")
early_time = c("1794_d04","1797_d02","1980_d02","2232_d02","2239_d02")
direct = filter(dvg_df, ferret_day %in% first_time) %>% unique()
 
samples = unique(index$ferret_day)
  
dvg_transmitted = data.frame()
  
for(i in samples){
  print(i)
  
  n = filter(index, ferret_day == i)
  partner = unique(n$pair_ids)
  d = filter(direct, pair_ids %in% partner)
  
  if(nrow(d) > 0){
    
    s = unique(d$ferret_day)
    print(s)
  
    comp = merge(n, d, by = c("index_direct","pair_ids","pair","segment","seg_dvg","DVG_group","GroupBoundaries"), all.x = TRUE) %>%
      count(pair_ids,ferret_day.x, ferret_day.y)
    colnames(comp) = c("pair_ids","index","contact","count")
     
    dvg_transmitted = rbind(dvg_transmitted, comp)
   
  }else(print("No transmission"))
  
}

#dvg_transmitted = dvg_transmitted %>% 
#  pivot_wider(names_from = contact, values_from = count)
```

```{r}
 contact_dvgs = filter(dvg_df, type == "direct") %>% count(ferret_id,dpi,ferret_weight) %>%
  ggplot(., aes(x = dpi, y = n, color = ferret_weight)) +
  geom_point() +
  geom_line(aes(group = ferret_id)) +
  facet_grid(~ferret_id) +
  ylab("DVG richness") +
  xlab("DPI") +
  PlotTheme1 +
  weight_colScale
print(contact_dvgs)
ggsave(contact_dvgs, file = "contact_dvgs.pdf", path = saveitdir)
```
